Faster R-CNN Setup

We first add Faster R-CNN to our PYTHONPATH. Faster R-CNN uses a pretrained neural network (VGG16) that was trained on ImageNet. We leverage this pre-existing model to extract bounding boxes for our potential classes. As mentioned in our paper, this pretrained network relies on labeled data, so effectively we are leveraging it to build a more finely tuned dataset. However, if we want to recognize completely novel objects, we can simply use DeepBox instead - that is, we don't need to have ANY prior dataset. We emphasize this in our paper as well.

import os
import sys

#Set the correct environment variables

#Add python to the system path so that python can find the package

#Add R-CNN Tools to the system path -> This in turn adds the correct R-CNN paths, see in the Tools folder

#From the demo py-faster-rcnn script <- Using a pre-trained neural network

import _init_paths
from fast_rcnn.config import cfg
from fast_rcnn.test import im_detect
from fast_rcnn.nms_wrapper import nms
from utils.timer import Timer
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import caffe, os, sys, cv2
import argparse

# Class names known to the detector. The position of a name in this tuple is
# the class index used to slice the score and box arrays further down.
CLASSES = (
    '__background__',
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
)

# Maps a short network key to (model directory name, weights file name).
# The VGG16 weights file name matches the "Loaded network ..." output of the
# notebook; the ZF entry follows the upstream py-faster-rcnn demo.
NETS = {
    'vgg16': ('VGG16', 'VGG16_faster_rcnn_final.caffemodel'),
    'zf': ('ZF', 'ZF_faster_rcnn_final.caffemodel'),
}

def vis_detections(im, class_name, dets, thresh=0.5):
    """Draw detected bounding boxes.

    Args:
        im: Image array whose last axis is reordered with (2, 1, 0) before
            plotting (assumes a 3-channel BGR image as returned by
            ``cv2.imread`` -- TODO confirm for array inputs).
        class_name: Label written next to every box and into the title.
        dets: 2-D array with one detection per row; the first four columns are
            the box corners (x1, y1, x2, y2) and the last column is the score.
        thresh: Minimum score (inclusive) a detection needs to be drawn.

    Returns:
        None. Nothing is plotted when no detection reaches ``thresh``.
    """
    inds = np.where(dets[:, -1] >= thresh)[0]
    if len(inds) == 0:
        # Nothing to draw: do not open an empty figure.
        return

    # Swap the first and third channel so matplotlib shows the right colours.
    im = im[:, :, (2, 1, 0)]
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(im, aspect='equal')
    for i in inds:
        bbox = dets[i, :4]
        score = dets[i, -1]

        # Rectangle takes (x, y) of the anchor corner plus width and height.
        ax.add_patch(
            plt.Rectangle((bbox[0], bbox[1]),
                          bbox[2] - bbox[0],
                          bbox[3] - bbox[1], fill=False,
                          edgecolor='red', linewidth=3.5)
            )
        ax.text(bbox[0], bbox[1] - 2,
                '{:s} {:.3f}'.format(class_name, score),
                bbox=dict(facecolor='blue', alpha=0.5),
                fontsize=14, color='white')

    ax.set_title(('{} detections with '
                  'p({} | box) >= {:.1f}').format(class_name, class_name,
                                                  thresh),
                 fontsize=14)
    plt.axis('off')
    plt.tight_layout()
    plt.draw()

def filter_detections(im, dets, thresh=0.5):
    """Keep only the detections whose score reaches ``thresh``.

    Args:
        im: Not used by this function; accepted so the signature parallels
            ``vis_detections``.
        dets: 2-D array with one detection per row; the last column is the
            score.
        thresh: Minimum score (inclusive) a detection needs to be kept.

    Returns:
        The rows of ``dets`` whose score is >= ``thresh``, or ``None`` when no
        row qualifies.
    """
    inds = np.where(dets[:, -1] >= thresh)[0]
    if len(inds) == 0:
        # Callers test the result with `is not None`, so signal "no
        # detections" with None rather than an empty array.
        return None
    return dets[inds]

def detectAndDrawObjects(net, image):
    """Run the detector on one image and plot the bus and car detections.

    Args:
        net: Loaded network, handed to ``im_detect`` unchanged.
        image: Either a path string (read with ``cv2.imread``) or an
            already-loaded image array.

    Raises:
        IOError: If ``image`` is a path that ``cv2.imread`` cannot read.
    """
    if isinstance(image, str):  # If given an image path, read in the image
        im = cv2.imread(image)
        if im is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('Could not read image: {:s}'.format(image))
    else:  # Otherwise, assume that image is the actual image
        im = image

    # Detect all object classes and regress object bounds. The timer has to be
    # started and stopped explicitly, otherwise total_time stays at zero.
    timer = Timer()
    timer.tic()
    scores, boxes = im_detect(net, im)
    timer.toc()
    print(('Detection took {:.3f}s for '
           '{:d} object proposals').format(timer.total_time, boxes.shape[0]))

    # Visualize detections for each class
    CONF_THRESH = 0.8
    NMS_THRESH = 0.3
    for cls in CLASSES[6:8]:  # We only care about the bus and car classes
        cls_ind = CLASSES.index(cls)  # column block of this class
        # Each class owns four consecutive box columns and one score column.
        cls_boxes = boxes[:, 4*cls_ind:4*(cls_ind + 1)]
        cls_scores = scores[:, cls_ind]
        dets = np.hstack((cls_boxes,
                          cls_scores[:, np.newaxis])).astype(np.float32)
        keep = nms(dets, NMS_THRESH)
        dets = dets[keep, :]

        vis_detections(im, cls, dets, thresh=CONF_THRESH)
def detectAndReturnObjects(net, image):
    """Run the detector on one image and return the bus and car boxes.

    Args:
        net: Loaded network, handed to ``im_detect`` unchanged.
        image: Either a path string (read with ``cv2.imread``) or an
            already-loaded image array.

    Returns:
        Float array with five columns, one row per kept detection, laid out as
        [x1, y1, x2, y2, score]. It has zero rows when nothing passes the
        confidence threshold.

    Raises:
        IOError: If ``image`` is a path that ``cv2.imread`` cannot read.
    """
    if isinstance(image, str):  # If given an image path, read in the image
        im = cv2.imread(image)
        if im is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('Could not read image: {:s}'.format(image))
    else:  # Otherwise, assume that image is the actual image
        im = image

    # Detect all object classes and regress object bounds
    scores, boxes = im_detect(net, im)

    # Collect the confident detections of each class of interest
    CONF_THRESH = 0.8
    NMS_THRESH = 0.3
    # The empty seed keeps the (0, 5) shape and float dtype of the result even
    # when no class contributes a detection.
    kept = [np.empty((0, 5), float)]
    for cls in CLASSES[6:8]:  # We only care about the bus and car classes
        cls_ind = CLASSES.index(cls)  # column block of this class
        cls_boxes = boxes[:, 4*cls_ind:4*(cls_ind + 1)]
        cls_scores = scores[:, cls_ind]
        dets = np.hstack((cls_boxes,
                          cls_scores[:, np.newaxis])).astype(np.float32)
        keep = nms(dets, NMS_THRESH)
        dets = dets[keep, :]
        tempBoxes = filter_detections(im, dets, thresh=CONF_THRESH)
        if tempBoxes is not None:
            kept.append(tempBoxes)

    # Stack once at the end instead of re-allocating inside the loop.
    boxesArray = np.vstack(kept)
    return boxesArray

cfg.TEST.HAS_RPN = True  # Use RPN for proposals

# Network definition (prototxt) and pretrained weights (caffemodel) for VGG16
prototxt = os.path.join(cfg.MODELS_DIR, NETS['vgg16'][0],
                        'faster_rcnn_alt_opt', 'faster_rcnn_test.pt')
caffemodel = os.path.join(cfg.DATA_DIR, 'faster_rcnn_models',
                          NETS['vgg16'][1])

if not os.path.isfile(caffemodel):
    raise IOError(('{:s} not found.\nDid you run ./data/script/'
                   'fetch_faster_rcnn_models.sh?').format(caffemodel))

#Run on GPU ID 0 <- Only works if you've configured CUDA and your GPU correctly
# NOTE(review): only cfg.GPU_ID is set here; the upstream demo also selects
# caffe's GPU mode and device before building the net -- confirm whether that
# is needed in this environment.
cfg.GPU_ID = 0
net = caffe.Net(prototxt, caffemodel, caffe.TEST)

print('\n\nLoaded network {:s}'.format(caffemodel))

#Take our test images and run R-CNN on them
im_path = '/home/thomas/pixy/Test_Cars/'
im_names = ['1.png', '2.png', '3.png']
im_names = [im_path + tempIm for tempIm in im_names]

for im_name in im_names:
    detectAndDrawObjects(net, im_name)

Loaded network /home/thomas/py-faster-rcnn/data/faster_rcnn_models/VGG16_faster_rcnn_final.caffemodel
Detection took 0.306s for 300 object proposals
Detection took 0.152s for 300 object proposals
Detection took 0.155s for 300 object proposals

Now let's just get the boxes out. Each box entry will have the format $\begin{bmatrix} x1 & y1 & x2 & y2 & score \end{bmatrix}$ where $0 \leq score \leq 1$.

import scipy.misc

cfg.TEST.HAS_RPN = True  # Use RPN for proposals

# Network definition (prototxt) and pretrained weights (caffemodel) for VGG16
prototxt = os.path.join(cfg.MODELS_DIR, NETS['vgg16'][0],
                        'faster_rcnn_alt_opt', 'faster_rcnn_test.pt')
caffemodel = os.path.join(cfg.DATA_DIR, 'faster_rcnn_models',
                          NETS['vgg16'][1])

if not os.path.isfile(caffemodel):
    raise IOError(('{:s} not found.\nDid you run ./data/script/'
                   'fetch_faster_rcnn_models.sh?').format(caffemodel))

#Run on GPU ID 0 <- Only works if you've configured CUDA and your GPU correctly
cfg.GPU_ID = 0
net = caffe.Net(prototxt, caffemodel, caffe.TEST)

print('\n\nLoaded network {:s}'.format(caffemodel))

#Take our test images and run R-CNN on them
im_path = '/home/thomas/pixy/Test_Cars/'
im_names = ['1.png', '2.png', '3.png']
im_names = [im_path + tempIm for tempIm in im_names]
output_path = '/home/thomas/pixy/Test_Cars_Output/'
outputCounter = 0

#For some reason, cv2 just won't write the images if the directory doesn't exist
if not os.path.isdir(output_path):
    os.makedirs(output_path)

for im_name in im_names:
    # Read the image once and reuse it for detection and for every crop,
    # instead of re-reading the file for each box.
    im = cv2.imread(im_name)
    boxes = detectAndReturnObjects(net, im)
    for box in boxes:
        # Each box is [x1, y1, x2, y2, score]; only the corners are needed.
        x1, y1, x2, y2 = [int(num) for num in box[:4]]
        boxIm = im[y1:y2, x1:x2, :]  #Rows of the image are the y axis, columns are the x axis
        #Write the segmented output
        cv2.imwrite(output_path + str(outputCounter) + '.jpg', boxIm)
        outputCounter += 1
        boxIm = cv2.cvtColor(boxIm, cv2.COLOR_BGR2RGB) #matplotlib uses a different storage format than cv2, this is just for displaying images
        plt.imshow(boxIm)

Loaded network /home/thomas/py-faster-rcnn/data/faster_rcnn_models/VGG16_faster_rcnn_final.caffemodel

R-CNN does a great job extracting the cars from our image. Now, let's run it on the real dataset. We'll be using images from the CityScapes dataset. Unfortunately, due to licensing issues, we can't include the CityScapes dataset in our repository (also because we can't upload 10 GB of images to GitHub). To gain access to the dataset, register on the CityScapes dataset website (https://www.cityscapes-dataset.com), download the training images, and place them in the PIXY_PATH/CityScapes/ folder. The training images look extremely similar to the three images provided in the Test_Cars folder.

import scipy.misc
from tqdm import tqdm
from os import listdir
from os.path import isfile, join

cfg.TEST.HAS_RPN = True  # Use RPN for proposals

# Network definition (prototxt) and pretrained weights (caffemodel) for VGG16
prototxt = os.path.join(cfg.MODELS_DIR, NETS['vgg16'][0],
                        'faster_rcnn_alt_opt', 'faster_rcnn_test.pt')
caffemodel = os.path.join(cfg.DATA_DIR, 'faster_rcnn_models',
                          NETS['vgg16'][1])

if not os.path.isfile(caffemodel):
    raise IOError(('{:s} not found.\nDid you run ./data/script/'
                   'fetch_faster_rcnn_models.sh?').format(caffemodel))

#Run on GPU ID 0 <- Only works if you've configured CUDA and your GPU correctly
cfg.GPU_ID = 0
net = caffe.Net(prototxt, caffemodel, caffe.TEST)

print('\n\nLoaded network {:s}'.format(caffemodel))

#Take our training images and run R-CNN on them
im_path = '/home/thomas/pixy/CityScapes/'
im_names = onlyfiles = [f for f in listdir(im_path) if isfile(join(im_path, f))]
im_names = [im_path + tempIm for tempIm in im_names]
output_path = '/home/thomas/pixy/CityScapes_Output/'
outputCounter = 0

#For some reason, cv2 just won't write the images if the directory doesn't exist
if not os.path.isdir(output_path):
    os.makedirs(output_path)

for im_name in tqdm(im_names):
    # Read the image once and reuse it for detection and for every crop,
    # instead of re-reading the file for each box.
    im = cv2.imread(im_name)
    boxes = detectAndReturnObjects(net, im)
    for box in boxes:
        # Each box is [x1, y1, x2, y2, score]; only the corners are needed.
        x1, y1, x2, y2 = [int(num) for num in box[:4]]
        boxIm = im[y1:y2, x1:x2, :]
        #Write the segmented output
        cv2.imwrite(output_path + str(outputCounter) + '.jpg', boxIm)
        outputCounter += 1

Loaded network /home/thomas/py-faster-rcnn/data/faster_rcnn_models/VGG16_faster_rcnn_final.caffemodel
100%|██████████| 2975/2975 [16:07<00:00,  3.54it/s]

# Report how many crops the segmentation loop wrote.
print("Segmented {} cars!".format(outputCounter))

Segmented 7608 cars!

Visual Bag of Words

For each of our classes, we compute the SIFT features and k-means cluster in order to get our "visual words". A great overview of this method::

1) First we provide the directory of our images and split our dataset into train and testing data.

#Import necessary libraries, following is based on the PHOW script, with modifications
import phow_caltech101 as phow
from datetime import datetime
from os.path import exists
from sklearn.kernel_approximation import AdditiveChi2Sampler
from cPickle import dump, load
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score
import pylab as pl
import numpy as np
import matplotlib.pyplot as plt

#Define initial configuration setup variables
# String used to cache our datasets; reusing an identifier lets us retrieve
# things from the cache later.
IDENTIFIER = 'cityscape1'
# Ignore the cache?
OVERWRITE = True
# From each directory, how many images we train with to find our visual words.
NUMTRAIN = 25
# From each directory, how many images we use to test how good the visual
# words clustering is.
NUMTEST = 0
# How many folders we have, i.e. how many visual word clusterings we do.
NUMCLASSES = 4
# How many words we use to describe our image classes.
NUMWORDS = 600

# Build the PHOW configuration object for this run from IDENTIFIER.
# NOTE(review): only IDENTIFIER is passed in; the NUM* constants defined in
# this notebook are not handed to Configuration here -- confirm how they reach it.
conf = phow.Configuration(IDENTIFIER)

Finished configuring system

# Look up the classes using the directory and class count held by conf.
classes = phow.get_classes(conf.calDir, conf.numClasses)

# Model object for these classes; the vocabulary is attached to it later.
model = phow.Model(classes, conf)

# Gather all images with their class labels, then derive the train and test
# selections (used further down to index rows of the feature matrix).
all_images, all_images_class_labels = phow.get_all_images(classes, conf)
selTrain, selTest = phow.create_split(all_images, conf)

Found classes and created split

2016-12-10 20:23:48.155656| Finished configuring system
2016-12-10 20:23:48.156441| Found classes and created split

We now use our selTrain data set and find our visual bag of words, we've specified to find 600 visual words.

# Train vocabulary
print(str(datetime.now()) + '| Start training vocabulary - launching threads to do SIFT')
# Retrain when no cached vocabulary exists or when the cache is to be ignored;
# otherwise load the cached vocabulary from disk.
if (not exists(conf.vocabPath)) or OVERWRITE:
    vocab = phow.trainVocab(selTrain, all_images, conf)
    phow.savemat(conf.vocabPath, {'vocab': vocab})
else:
    print(str(datetime.now()) + '| Done! Using old vocab from ' + conf.vocabPath)
    vocab = phow.loadmat(conf.vocabPath)['vocab']

model.vocab = vocab #The columns of vocab are our visual words

2016-12-10 20:23:49.438075| Start training vocabulary - launching threads to do SIFT
2016-12-10 20:23:49.438195| Done! Using old vocab from tempresults/

Calculate amount of "contribution" each word has to a specific image. (Imagine that the vocab is a set of singular vectors and we're computing our singular values.)

First, we pull out the SIFT vector from each image.

# Compute spatial histograms
print(str(datetime.now()) + '| Computing Spatial Histograms')
# Recompute when no cached histograms exist or when the cache is to be
# ignored; otherwise load the cached histograms from disk.
if (not exists(conf.histPath)) or OVERWRITE:
    hists = phow.computeHistograms(all_images, model, conf, vocab)
    phow.savemat(conf.histPath, {'hists': hists})
else:
    print(str(datetime.now()) + '| Found old historams at:' + conf.histPath)
    hists = phow.loadmat(conf.histPath)['hists']

2016-12-10 20:23:51.107821| Computing Spatial Histograms
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    5.9s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   15.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   30.6s remaining:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   30.6s finished

After that, we use the AdditiveChi2Sampler from sci-kit learn to do a feature mapping (analogously calculating the singular values).

# Compute feature map
print(str(datetime.now()) + '| Computing Feature Map')
# Map the histograms through scikit-learn's additive chi-squared sampler,
# then pick the rows that belong to the train and test selections.
transformer = AdditiveChi2Sampler()
histst = transformer.fit_transform(hists)
train_data = histst[selTrain]
test_data = histst[selTest]

2016-12-10 20:24:35.433906| Computing Feature Map

Fit our word vectors with the Logistic Regression Model using 1, 2, 3, and 4 as our class labels -> corresponding to Bus, Sedan, SUV, and Van.

# Training targets: 25 consecutive copies of each class label, in the order
# 1, 2, 3, 4. tolist() keeps the entries plain Python ints.
y = np.repeat([1, 2, 3, 4], 25).tolist()

from sklearn import linear_model

# Logistic regression classifier; C=1e5 sets scikit-learn's inverse
# regularisation strength (a large C means weak regularisation).
logreg = linear_model.LogisticRegression(C=1e5)
# NOTE(review): the fit call was garbled in this copy of the notebook (only
# ", y)" survived); fitting on train_data is the presumed original -- confirm.
logreg.fit(train_data, y)


array([[  3.09279993e-05,   9.99866188e-01,   1.00075198e-04,

As expected, when given a picture of a Bus from our training dataset, Logistic Regression classifies it as a bus with a probability of about .99966 (i.e. roughly 99.97 percent). Note that logistic regression normalizes the probabilities such that the class label probabilities sum to 1.

Now, let's do the visual bag of words on our testing dataset. First, we have to extract the cars from our test dataset.

import scipy.misc
from tqdm import tqdm
from os import listdir
from os.path import isfile, join

cfg.TEST.HAS_RPN = True  # Use RPN for proposals

# Network definition (prototxt) and pretrained weights (caffemodel) for VGG16
prototxt = os.path.join(cfg.MODELS_DIR, NETS['vgg16'][0],
                        'faster_rcnn_alt_opt', 'faster_rcnn_test.pt')
caffemodel = os.path.join(cfg.DATA_DIR, 'faster_rcnn_models',
                          NETS['vgg16'][1])

if not os.path.isfile(caffemodel):
    raise IOError(('{:s} not found.\nDid you run ./data/script/'
                   'fetch_faster_rcnn_models.sh?').format(caffemodel))

#Run on GPU ID 0 <- Only works if you've configured CUDA and your GPU correctly
cfg.GPU_ID = 0
net = caffe.Net(prototxt, caffemodel, caffe.TEST)

print('\n\nLoaded network {:s}'.format(caffemodel))

#Take our test images and run R-CNN on them
im_path = '/home/thomas/pixy/CityScapes_Test/'
im_names = onlyfiles = [f for f in listdir(im_path) if isfile(join(im_path, f))]
im_names = [im_path + tempIm for tempIm in im_names]
output_path = '/home/thomas/pixy/CityScapes_Test_Output/'
outputCounter = 0

#For some reason, cv2 just won't write the images if the directory doesn't exist
if not os.path.isdir(output_path):
    os.makedirs(output_path)

for im_name in tqdm(im_names):
    # Read the image once and reuse it for detection and for every crop,
    # instead of re-reading the file for each box.
    im = cv2.imread(im_name)
    boxes = detectAndReturnObjects(net, im)
    for box in boxes:
        # Each box is [x1, y1, x2, y2, score]; only the corners are needed.
        x1, y1, x2, y2 = [int(num) for num in box[:4]]
        boxIm = im[y1:y2, x1:x2, :]
        #Write the segmented output
        cv2.imwrite(output_path + str(outputCounter) + '.jpg', boxIm)
        outputCounter += 1

Loaded network /home/thomas/py-faster-rcnn/data/faster_rcnn_models/VGG16_faster_rcnn_final.caffemodel
100%|██████████| 2025/2025 [11:52<00:00,  3.70it/s]

# Report how many crops the segmentation loop wrote.
print("Segmented {} cars!".format(outputCounter))

Segmented 6200 cars!

Skip to here if you've already segmented your testing set:

from os import listdir
from os.path import isfile, join

output_path = '/home/thomas/pixy/CityScapes_Test_Output/'

# File names are listed from the absolute output directory but handed on with
# a relative 'CityScapes_Test_Output/' prefix.
out_names = onlyfiles = [f for f in listdir(output_path) if isfile(join(output_path, f))]
out_names = ['CityScapes_Test_Output/' + tempIm for tempIm in out_names]

# NOTE(review): this path ends at the directory; the cache file name appears
# to have been lost from this copy of the notebook -- restore it before use.
tempHistPath = 'tempresults/'
# True ignores the cache and recomputes the histograms, which takes a very
# long time; set this to False to reuse previously saved histograms.
OVERWRITE = True

# Compute spatial histograms
print(str(datetime.now()) + '| Computing Spatial Histograms')
if (not exists(tempHistPath)) or OVERWRITE:
    out_hists = phow.computeHistograms(out_names, model, conf, vocab)
    phow.savemat(tempHistPath, {'hists': out_hists})
else:
    print(str(datetime.now()) + '| Found old historams at:' + tempHistPath)
    out_hists = phow.loadmat(tempHistPath)['hists']

2016-12-10 20:24:51.505217| Computing Spatial Histograms
# Compute feature map
print(str(datetime.now()) + '| Computing Feature Map')
# Same additive chi-squared mapping that was applied to the training histograms.
transformer = AdditiveChi2Sampler()
out_histst = transformer.fit_transform(out_hists)

2016-12-10 20:46:30.228778| Computing Feature Map

Our class labels are in the form $\begin{bmatrix} P(Bus|Features) & P(Sedan|Features) & P(SUV|Features) & P(Van|Features) \end{bmatrix}$.

array([[-16.14175298,   0.64812159,  -0.03645473,  -7.9996721 ]])

array([[  8.50855348e-08,   5.72034643e-01,   4.27673011e-01,

# Class probabilities for every test crop, one row per crop and one column per
# class. The rows are gathered first and stacked once, which avoids
# re-allocating the whole matrix on every iteration. The empty seed keeps the
# (0, 4) shape and float dtype when there are no crops at all.
out_proba = np.vstack(
    [np.empty((0, 4), float)]
    + [logreg.predict_proba(out_hist.reshape(1, -1)) for out_hist in out_histst]
)


[[  8.50855348e-08   5.72034643e-01   4.27673011e-01   2.92260995e-04]
 [  7.63422631e-04   7.02903436e-04   9.98349174e-01   1.84500170e-04]
 [  1.53968925e-02   3.55315097e-03   9.80337285e-01   7.12671257e-04]
 [  7.18620202e-02   3.74637245e-03   9.24309666e-01   8.19413858e-05]
 [  1.63482686e-05   8.82040689e-01   1.17942681e-01   2.82166680e-07]
 [  3.11203657e-02   1.67834902e-01   8.00852605e-01   1.92127411e-04]]

We set a label to 1 if the predicted probability of that class is at least 20 percent.

# (row, column) positions of every probability that reaches the 20% cut-off.
out_labels_idx = np.argwhere(out_proba >= .20)


# Label matrix shaped like out_proba: 1 at the selected positions, 0 elsewhere.
out_labels = np.zeros(out_proba.shape, float)
out_labels[out_labels_idx[:, 0], out_labels_idx[:, 1]] = 1

Now that we have our overall labeling matrix, let's feed it into snorkel.

Snorkel Installation and Setup

First, let's load snorkel into our Jupyter python path. For some reason, the snorkel installation doesn't always work so this is here as a backup. Make sure that you follow the installation instructions in the README as well before executing any code.

import os
import sys

#Set the correct environment variables

#Add python to the system path so that python can find the package

%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
from snorkel import SnorkelSession
# Instantiate a SnorkelSession and keep it as `session`.
session = SnorkelSession()

Fitting the Generative Model

We estimate the accuracies of the labeling functions without supervision. Specifically, we estimate the parameters of a NaiveBayes generative model.

First, we have to specify a sparse matrix with labeling function output. The setup of the matrix is as follows:

- Rows of the matrix correspond to individual test images.
- Columns of the matrix correspond to individual labeling functions.
- Entries in the matrix are {-1, 0, 1}, the possible outputs of each labeling function.

We first fit our generative model to the first class label (the first column). Therefore, we negate the output in the 2nd to 4th columns.

In [75]:

# Work on a copy: a plain assignment would only alias out_labels, so the sign
# flips below would silently rewrite out_labels as well and corrupt it for any
# later per-class label matrix.
bus_labels = out_labels.copy()
# Bus is the first column; a vote for any of the other three classes is
# turned into a vote against "bus".
bus_labels[:, 1:4] = np.negative(out_labels[:, 1:4])

#Convert our sparse array into a format that snorkel knows how to deal with
from snorkel.annotations import csr_LabelMatrix
from snorkel.annotations import csr_AnnotationMatrix

bus_label_matrix = csr_LabelMatrix(csr_AnnotationMatrix(bus_labels))

In [132]:
from snorkel.learning import NaiveBayes

# Train the generative model on the bus label matrix.
# NOTE(review): n_iter and rate are presumably the iteration count and the
# learning rate of snorkel's trainer -- confirm against the snorkel docs.
gen_model = NaiveBayes()
gen_model.train(bus_label_matrix, n_iter=1000, rate=1e-5)


In [133]:
# Marginals computed by the trained generative model for the bus label matrix;
# they are passed to the discriminative model as its training targets below.
train_marginals = gen_model.marginals(bus_label_matrix)


In [134]:
from snorkel.learning import LogReg
from snorkel.learning_utils import RandomSearch, ListParameter, RangeParameter

# Hyperparameter candidates that are handed to RandomSearch further down.
# NOTE(review): with log_base=10 the Range bounds are presumably sampled on a
# log scale in `step`-sized exponent increments -- confirm in the snorkel docs.
iter_param = ListParameter('n_iter', [250, 500, 1000, 2000])
rate_param = RangeParameter('rate', 1e-4, 1e-2, step=0.75, log_base=10)
reg_param  = RangeParameter('mu', 1e-8, 1e-2, step=1, log_base=10)

# Discriminative model that is trained on the generative model's marginals.
disc_model = LogReg()

In [135]:
# NOTE(review): the searcher is constructed, but no search is run on it in the
# code visible here; the model below is trained with fixed hyperparameters.
searcher = RandomSearch(disc_model, bus_label_matrix, train_marginals, 10, iter_param, rate_param, reg_param)

# Train the discriminative model against the generative model's marginals.
disc_model.train(bus_label_matrix, train_marginals, n_iter=1000, rate=0.001)

# Predictions of the trained discriminative model for every row.
denoisedMatrix = disc_model.predict(bus_label_matrix)

# Row indices where the denoised prediction equals 1.
# NOTE(review): the three statements below evaluate the same expression on the
# same denoisedMatrix, so they only differ if denoisedMatrix was recomputed
# for another class between them -- that code is not visible here.
onlyBus = np.where(denoisedMatrix == 1)[0]

onlyVan = np.where(denoisedMatrix == 1)[0]

onlySUV = np.where(denoisedMatrix == 1)[0]

0.6799709970887919


# Row indices where the denoised prediction equals 1.
# NOTE(review): same expression as onlyBus/onlyVan/onlySUV above -- confirm
# which denoisedMatrix this is meant to read.
onlyCars = np.where(denoisedMatrix == 1)[0]

# Most likely class per crop. np.argmax returns the 0-based column index of
# out_proba, whose columns are ordered Bus, Sedan, SUV, Van.
MLELabels = [np.argmax(tempProb) for tempProb in out_proba]

# Compare against the 0-based column indices. Comparing against 1..4 shifted
# every class by one: Bus was never selected and Van was always empty.
_mle_labels = np.asarray(MLELabels)
BusArray = np.where(_mle_labels == 0)[0]
SedanArray = np.where(_mle_labels == 1)[0]
SUVArray = np.where(_mle_labels == 2)[0]
VanArray = np.where(_mle_labels == 3)[0]

